# Reproducible 80/20 split into analysis vs. hold-out test,
# stratified on the outcome so both partitions keep a similar attrition rate.
set.seed(100)
test_split <- initial_split(data_clean, prop = 0.8, strata = "attrition")
# Carve a validation set out of the training portion (again 80/20, stratified).
set.seed(100)
val_split <- initial_split(training(test_split), prop = 0.8, strata = "attrition")
# quick check: prints as <analysis/assessment/total> row counts
test_split
#> <1177/293/1470>
# Preprocessing recipe, estimated on the training split only so nothing
# learned (centers, scales, factor levels) leaks from validation/test data.
rec <- recipe(attrition ~ ., data = training(val_split)) %>%
  step_rm(employee_count, employee_number) %>%   # identifier columns, no signal
  step_nzv(all_predictors()) %>%                 # drop near-zero-variance predictors
  step_string2factor(all_nominal(), -attrition) %>%
  step_string2factor(attrition, levels = c("yes", "no")) %>%
  # Downsample the majority class to a 1:1 ratio. recipes skips this step when
  # baking new data, so validation/test keep their natural class balance.
  step_downsample(attrition, ratio = 1/1, seed = 100) %>%
  step_center(all_numeric()) %>%
  step_scale(all_numeric()) %>%
  step_dummy(all_nominal(), -attrition, one_hot = FALSE) %>%
  prep(strings_as_factors = FALSE)

# Materialize train/validation/test datasets.
# bake(rec, new_data = NULL) returns the retained (already processed) training
# data and replaces the deprecated juice(rec).
data_train <- bake(rec, new_data = NULL)
data_val <- bake(rec, testing(val_split))
data_test <- bake(rec, testing(test_split))
# quick check
head(data_train, 10)

# Helper: turn one baked data frame into keras-ready arrays, replacing three
# copy-pasted preparation blocks. Returns a list with
#   y: one-hot matrix built from the attrition factor codes (0-based)
#   x: numeric matrix of every predictor column
to_keras_arrays <- function(df) {
  list(
    y = to_categorical(as.numeric(df$attrition) - 1),
    x = data.matrix(select(df, -attrition))
  )
}

# prepare train arrays
train_arrays <- to_keras_arrays(data_train)
data_train_x <- train_arrays$x
data_train_y <- train_arrays$y
# prepare validation arrays
val_arrays <- to_keras_arrays(data_val)
data_val_x <- val_arrays$x
data_val_y <- val_arrays$y
# prepare test arrays
test_arrays <- to_keras_arrays(data_test)
data_test_x <- test_arrays$x
data_test_y <- test_arrays$y

# quick check (training set is downsampled: 306 rows, 44 predictors, 2 classes)
dim(data_train_x)
#> [1] 306 44
dim(data_train_y)
#> [1] 306 2
# Network architecture: input -> two dense blocks -> 2-unit sigmoid head.

# Input layer sized to the number of predictor columns.
input <- layer_input(name = "input", shape = ncol(data_train_x))

# First hidden block: dense -> leaky ReLU -> batch norm -> dropout.
block_one <- input %>%
  layer_dense(name = "dense_1", units = 64) %>%
  layer_activation_leaky_relu(name = "dense_1_act") %>%
  layer_batch_normalization(name = "dense_1_bn") %>%
  layer_dropout(name = "dense_1_dp", rate = 0.15)

# Second hidden block, half the width of the first.
hiddens <- block_one %>%
  layer_dense(name = "dense_2", units = 32) %>%
  layer_activation_leaky_relu(name = "dense_2_act") %>%
  layer_batch_normalization(name = "dense_2_bn") %>%
  layer_dropout(name = "dense_2_dp", rate = 0.15)

# Output head: one unit per class, squashed through a sigmoid.
output <- hiddens %>%
  layer_dense(name = "output", units = ncol(data_train_y)) %>%
  layer_batch_normalization(name = "output_bn") %>%
  layer_activation(name = "output_act", activation = "sigmoid")

# Assemble the full model from the input and output tensors.
model <- keras_model(inputs = input, outputs = output)

# Compile with Adam and binary cross-entropy.
# NOTE(review): `lr` is deprecated in newer keras in favour of `learning_rate`;
# kept as-is for compatibility with the keras version this script targets.
model %>% compile(
  optimizer = optimizer_adam(lr = 0.001),
  metrics = "accuracy",
  loss = "binary_crossentropy"
)
# model summary: architecture and parameter counts (output pasted from console)
summary(model)
#> ___________________________________________________________________________
#> Layer (type) Output Shape Param #
#> ===========================================================================
#> input (InputLayer) (None, 44) 0
#> ___________________________________________________________________________
#> dense_1 (Dense) (None, 64) 2880
#> ___________________________________________________________________________
#> dense_1_act (LeakyReLU) (None, 64) 0
#> ___________________________________________________________________________
#> dense_1_bn (BatchNormalizationV1 (None, 64) 256
#> ___________________________________________________________________________
#> dense_1_dp (Dropout) (None, 64) 0
#> ___________________________________________________________________________
#> dense_2 (Dense) (None, 32) 2080
#> ___________________________________________________________________________
#> dense_2_act (LeakyReLU) (None, 32) 0
#> ___________________________________________________________________________
#> dense_2_bn (BatchNormalizationV1 (None, 32) 128
#> ___________________________________________________________________________
#> dense_2_dp (Dropout) (None, 32) 0
#> ___________________________________________________________________________
#> output (Dense) (None, 2) 66
#> ___________________________________________________________________________
#> output_bn (BatchNormalizationV1) (None, 2) 8
#> ___________________________________________________________________________
#> output_act (Activation) (None, 2) 0
#> ===========================================================================
#> Total params: 5,418
#> Trainable params: 5,222
#> Non-trainable params: 196
#> ___________________________________________________________________________
# Predict class probabilities on the test set and derive a hard label at the
# 0.5 threshold. predict() returns one unnamed column per output unit;
# assumes column order matches the outcome factor levels c("yes", "no")
# -- TODO confirm this holds for the keras/recipes versions in use.
pred_test <- as_tibble(predict(model, data_test_x)) %>%
  set_names(levels(data_train$attrition)) %>%
  mutate(class = ifelse(yes > 0.5, "yes", "no")) %>%
  mutate(class = factor(class, levels = levels(data_train$attrition))) %>%
  set_names(paste0(".pred_", colnames(.)))
# combine with test dataset: keep the true label next to the predictions
pred_test <- data_test %>%
  select(attrition) %>%
  bind_cols(pred_test)
# quick check
head(pred_test, 10)
# metrics summary: yardstick vector helpers on truth vs. predicted class
pred_test %>%
  summarise(
    accuracy = accuracy_vec(attrition, .pred_class),
    sensitivity = sens_vec(attrition, .pred_class),
    specificity = spec_vec(attrition, .pred_class),
    precision = precision_vec(attrition, .pred_class)
  )
# ROC curve data on the test dataset (threshold vs. sensitivity/specificity)
pred_test_roc <- pred_test %>%
  roc_curve(attrition, .pred_yes)
# tidying: round every numeric column, then reshape to long (metric, value)
# pairs per threshold. across()/pivot_longer() replace the superseded
# mutate_if()/gather() and produce the same columns.
pred_test_roc <- pred_test_roc %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(-.threshold, names_to = "metric", values_to = "value")
# plot sensitivity-specificity trade-off across thresholds
p <- ggplot(pred_test_roc, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()
ggplotly(p)
# Precision-recall curve data on the test dataset
pred_test_pr <- pred_test %>%
  pr_curve(attrition, .pred_yes)
# tidying: round every numeric column, then reshape to long (metric, value)
# pairs per threshold. across()/pivot_longer() replace the superseded
# mutate_if()/gather() and produce the same columns.
pred_test_pr <- pred_test_pr %>%
  mutate(across(where(is.numeric), ~ round(.x, 4))) %>%
  pivot_longer(-.threshold, names_to = "metric", values_to = "value")
# plot recall-precision trade-off across thresholds
p <- ggplot(pred_test_pr, aes(x = .threshold, y = value)) +
  geom_line(aes(colour = metric)) +
  labs(x = "Probability Threshold to be Classified as Positive", y = "Value", colour = "Metrics") +
  theme_minimal()
ggplotly(p)
# Preprocess raw observations for lime: apply the trained recipe, drop the
# outcome column, and return the predictors as a numeric matrix.
get_features <- function(x) {
  baked <- bake(rec, new_data = x, -attrition)
  data.matrix(baked)
}
# Wrap the keras model as a lime classifier, attaching the class labels.
lime_model <- as_classifier(model, levels(data_train$attrition))
# Build the lime explainer.
# NOTE(review): `data_explain` is not defined in this chunk -- presumably a
# small sample of raw (un-baked) observations to explain; confirm upstream.
explainer <- lime(
  x = data_explain,
  model = lime_model,
  preprocess = get_features
)
# Local explanations: top 5 features for each of the 2 class labels.
explanation <- lime::explain(
  x = data_explain,
  explainer = explainer,
  n_labels = 2,
  n_features = 5
)
# plot feature explanation for the first case only
plot_features(explanation, cases = 1, ncol = 1)